library(tidyverse) #For Almost everything
library(rvest) #For Web Scraping
library(DT) #For formating tables plotting in html
library(lubridate) #For Date and Times manipulation
library(tictoc) #For calculating time execution
library(maptools) #For reading shape files
library(ggmap) #Get the geolocation
Sys.setlocale("LC_CTYPE","Arabic_Saudi Arabia") #change the locale system to get proper arabic
[1] "Arabic_Saudi Arabia.1256"
# Store web url
immo_url <- ("https://www.ouedkniss.com/annonces/index.php?c=immobilier&sc=vente&sc2=appartement&wilaya=%2Calger&prix=1&prix_unite=2&p=1")
immo_url
[1] "https://www.ouedkniss.com/annonces/index.php?c=immobilier&sc=vente&sc2=appartement&wilaya=%2Calger&prix=1&prix_unite=2&p=1"
# Download the search-results page and collect the href attribute of every
# element carrying the "button_details" class.
Links <- html_attr(
  html_nodes(read_html(immo_url, encoding = "UTF-8"), ".button_details"),
  "href"
)
Links # show the (relative) links that were collected
[1] "vente-appartement-f3-alger-bab-ezzouar-algerie-immobilier-d16678458" "vente-appartement-f3-alger-birtouta-algerie-immobilier-d16680050"
[3] "vente-appartement-f2-alger-birtouta-algerie-immobilier-d14890818" "vente-appartement-f4-alger-birtouta-algerie-immobilier-d14988773"
[5] "vente-appartement-f4-alger-centre-algerie-immobilier-d16676423" "vente-appartement-f3-alger-baba-hassen-algerie-immobilier-d16678782"
[7] "vente-appartement-f2-alger-bab-el-oued-algerie-immobilier-d16675650" "vente-appartement-f4-alger-bouzareah-algerie-immobilier-d16678633"
[9] "vente-appartement-f2-alger-centre-algerie-immobilier-d16675612" "vente-appartement-f5-alger-kouba-algerie-immobilier-d14025834"
[11] "vente-appartement-f3-alger-birkhadem-algerie-immobilier-d16270056" "vente-appartement-f2-alger-oued-smar-algerie-immobilier-d16678170"
[13] "vente-appartement-f3-alger-draria-algerie-immobilier-d15990036" "vente-appartement-f3-alger-bordj-el-kiffan-algerie-immobilier-d16677974"
[15] "vente-appartement-f3-alger-bordj-el-kiffan-algerie-immobilier-d16193073" "vente-appartement-f4-alger-el-mouradia-algerie-immobilier-d16391032"
[17] "vente-appartement-f4-alger-zeralda-algerie-immobilier-d13299882" "vente-appartement-f3-alger-ain-naadja-algerie-immobilier-d16677515"
[19] "vente-appartement-f5-alger-el-mouradia-algerie-immobilier-d15734403" "vente-appartement-f3-alger-baraki-algerie-immobilier-d16676668"
[21] "vente-appartement-f3-alger-zeralda-algerie-immobilier-d15967704" "vente-appartement-f3-alger-bab-ezzouar-algerie-immobilier-d16675834"
[23] "vente-appartement-f3-alger-el-harrach-algerie-immobilier-d13034379" "vente-appartement-f2-alger-baraki-algerie-immobilier-d16675458"
[25] "vente-appartement-f2-alger-said-hamdine-algerie-immobilier-d16675267" "vente-appartement-f5-alger-bab-ezzouar-algerie-immobilier-d16675171"
[27] "vente-appartement-f3-alger-ouled-fayet-algerie-immobilier-d15721644" "vente-appartement-f2-alger-el-biar-algerie-immobilier-d11289501"
[29] "vente-appartement-f2-alger-baraki-algerie-immobilier-d16540460" "vente-appartement-f4-alger-baraki-algerie-immobilier-d11916108"
# Flatten Links to a plain vector and prefix each entry with the site root
Links <- paste0("https://www.ouedkniss.com/", unlist(Links))
Links
[1] "https://www.ouedkniss.com/vente-appartement-f3-alger-bab-ezzouar-algerie-immobilier-d16678458"
[2] "https://www.ouedkniss.com/vente-appartement-f3-alger-birtouta-algerie-immobilier-d16680050"
[3] "https://www.ouedkniss.com/vente-appartement-f2-alger-birtouta-algerie-immobilier-d14890818"
[4] "https://www.ouedkniss.com/vente-appartement-f4-alger-birtouta-algerie-immobilier-d14988773"
[5] "https://www.ouedkniss.com/vente-appartement-f4-alger-centre-algerie-immobilier-d16676423"
[6] "https://www.ouedkniss.com/vente-appartement-f3-alger-baba-hassen-algerie-immobilier-d16678782"
[7] "https://www.ouedkniss.com/vente-appartement-f2-alger-bab-el-oued-algerie-immobilier-d16675650"
[8] "https://www.ouedkniss.com/vente-appartement-f4-alger-bouzareah-algerie-immobilier-d16678633"
[9] "https://www.ouedkniss.com/vente-appartement-f2-alger-centre-algerie-immobilier-d16675612"
[10] "https://www.ouedkniss.com/vente-appartement-f5-alger-kouba-algerie-immobilier-d14025834"
[11] "https://www.ouedkniss.com/vente-appartement-f3-alger-birkhadem-algerie-immobilier-d16270056"
[12] "https://www.ouedkniss.com/vente-appartement-f2-alger-oued-smar-algerie-immobilier-d16678170"
[13] "https://www.ouedkniss.com/vente-appartement-f3-alger-draria-algerie-immobilier-d15990036"
[14] "https://www.ouedkniss.com/vente-appartement-f3-alger-bordj-el-kiffan-algerie-immobilier-d16677974"
[15] "https://www.ouedkniss.com/vente-appartement-f3-alger-bordj-el-kiffan-algerie-immobilier-d16193073"
[16] "https://www.ouedkniss.com/vente-appartement-f4-alger-el-mouradia-algerie-immobilier-d16391032"
[17] "https://www.ouedkniss.com/vente-appartement-f4-alger-zeralda-algerie-immobilier-d13299882"
[18] "https://www.ouedkniss.com/vente-appartement-f3-alger-ain-naadja-algerie-immobilier-d16677515"
[19] "https://www.ouedkniss.com/vente-appartement-f5-alger-el-mouradia-algerie-immobilier-d15734403"
[20] "https://www.ouedkniss.com/vente-appartement-f3-alger-baraki-algerie-immobilier-d16676668"
[21] "https://www.ouedkniss.com/vente-appartement-f3-alger-zeralda-algerie-immobilier-d15967704"
[22] "https://www.ouedkniss.com/vente-appartement-f3-alger-bab-ezzouar-algerie-immobilier-d16675834"
[23] "https://www.ouedkniss.com/vente-appartement-f3-alger-el-harrach-algerie-immobilier-d13034379"
[24] "https://www.ouedkniss.com/vente-appartement-f2-alger-baraki-algerie-immobilier-d16675458"
[25] "https://www.ouedkniss.com/vente-appartement-f2-alger-said-hamdine-algerie-immobilier-d16675267"
[26] "https://www.ouedkniss.com/vente-appartement-f5-alger-bab-ezzouar-algerie-immobilier-d16675171"
[27] "https://www.ouedkniss.com/vente-appartement-f3-alger-ouled-fayet-algerie-immobilier-d15721644"
[28] "https://www.ouedkniss.com/vente-appartement-f2-alger-el-biar-algerie-immobilier-d11289501"
[29] "https://www.ouedkniss.com/vente-appartement-f2-alger-baraki-algerie-immobilier-d16540460"
[30] "https://www.ouedkniss.com/vente-appartement-f4-alger-baraki-algerie-immobilier-d11916108"
# Walk through the extraction on a single listing (the first link)
Page <- Links[1] %>% read_html(encoding = "UTF-8")
Page # show the parsed document
{xml_document}
<html lang="fr">
[1] <head>\n<base href="//www.ouedkniss.com/">\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<meta name="google-site-verification" content="z ...
[2] <body>\n\t<script>user_agent='desktop';</script><script>var user_agent_forced=false;</script><script>language='fr';</script><div id="HTML" class="html">\n\t\t\n\ ...
# Extract the listing title: select the element with id "Title" in the parsed
# page and keep its text
Title <- html_text(html_nodes(Page, "#Title"))
Title
[1] "Vente Appartement F3 Alger Bab ezzouar"
# Extract the free-text description (element with id "GetDescription")
Description <- html_text(html_nodes(Page, "#GetDescription"))
Description
[1] "Cité Rabia Tahar, Bab-Ezzouar, vend un Appartement de type F-3 de 75 m2, 5ème étage, Acte. Prix : 1,8 Milliard négociable."
# Extract the price together with the detail paragraphs of the listing
Details <- html_text(html_nodes(Page, "#Prix,#Description p"))
Details
[1] "Numéro : 16678458" "Nombre de vues : 33" "Déposée le : 03-08-2018 à 18:30" " Quartier : Cité Rabia Tahar"
[5] " Nombre de pièces : 3" " Nombre d'étages / étage : 5" " Superficie : 75 M²" " Spécifications : Acte notarié "
[9] "Prix : 18 Millions Négociable"
# Extract the store name (same steps as above, written as a pipe this time)
Store.Name <- Page %>%
  html_nodes("#store_name") %>%
  html_text()
Store.Name
[1] "Day Immobilier"
# Extract the store address (element with id "store_adresse")
Store.Addresse <- html_text(html_nodes(Page, "#store_adresse"))
Store.Addresse
[1] "Alger Bab ezzouar cité 5 juillet Bt 110 Bab Ezzouar"
# Extract the pseudonym of the person who posted the listing (elements with
# class "Pseudo").
Pseudo <- Page %>%
  html_nodes(".Pseudo") %>%
  html_text()
# Fall back to NA only when the page has no pseudonym; a value that was
# actually scraped is kept instead of being overwritten unconditionally.
if (length(Pseudo) == 0) {
  Pseudo <- NA
}
# All the extracted pieces now have to be gathered into one table, but they
# do not all have the same length:
list(Links[1], Title, Description, Details, Store.Name, Pseudo) %>%
  map_dbl(length)
[1] 1 1 1 9 1 1
# A plain list could hold all the extracted elements, but a tibble (an enhanced data frame)
# is more convenient because its columns are allowed to be lists.
# NOTE(review): Store.Addresse is not passed to tibble() here although the mutate() below refers
# to it; presumably it is picked up from the variable created earlier in the session -- confirm.
# NOTE(review): the first column is built from the expression Links[1] rather than a named
# argument -- verify the resulting column name is the one the later group_by() expects.
one_link_data <- tibble(Links[1],Title, Description,Details,Store.Name, Pseudo)
one_link_data
# Replace empty results (which compare equal to the string "character(0)") with NA
one_link_data <- one_link_data %>%
mutate(Title = as.character(ifelse(Title=="character(0)", NA, Title)),
Description = as.character(ifelse(Description=="character(0)", NA, Description)),
Details = ifelse(Details=="character(0)", NA, Details),
Store.Name = as.character(ifelse(Store.Name=="character(0)", NA, Store.Name)),
Store.Addresse = as.character(ifelse(Store.Addresse=="character(0)", NA, Store.Addresse)),
Pseudo = as.character(ifelse(Pseudo=="character(0)", NA, Pseudo)),
# Split every "label : value" detail string on the first " : ":
# Det receives the trimmed label, Values the trimmed value
Det = map(Details, function(x) str_trim(str_split_fixed(string = x[], pattern = " : ", n=2)[,1])),
Values = map(Details, function(x) str_trim(str_split_fixed(string = x[], pattern = " : ", n=2)[,2]))
# Det and Values are list columns, so the data frame is now nested
) %>%
select(-Details)
one_link_data
# Get rid of the nested columns by unnesting the data
one_link_data <- one_link_data %>% unnest()
one_link_data
# Do some cleansing
one_link_data <- one_link_data %>%
filter(Det != "")%>% # drop the rows whose detail label is empty
mutate(Det = case_when( # normalise the spelling of the known labels; any other label is kept as is
.$Det == "Nombre de pièces" ~ "Nombre de pièces",
.$Det == "Spécifications" ~ "Spécifications",
.$Det == "Nombre d'étages / étage" ~ "Nombre d'étages / étage",
.$Det == "Déposée le" ~ "Déposée le",
.$Det == "Numéro" ~ "Numéro",
TRUE ~ as.character(.$Det)
)) %>%
# the next three lines remove duplicated labels: within each (link, label) group
# only the row with the highest Values is kept
group_by(Links[1], Det) %>%
arrange(desc(Values)) %>%
slice(1)
one_link_data
# Tidy the data: spread the labels into columns so that each housing listing
# ends up on one row with its details as columns
one_link_data <- one_link_data %>% spread(Det,value = Values)
one_link_data
NA
# Scrape every apartment listing linked from one (or several) search-result pages.
#
# Args:
#   nb.page: page number(s) appended to the `p=` parameter of the search URL
#            (default 1).
#
# Returns:
#   An ungrouped tibble with one row per listing: Links, Title, Description,
#   Store.Name, Pseudo, Store.Addresse, followed by one column per detail label
#   found on the listing pages. An empty tibble is returned when the result
#   page(s) yield no listing link (page unreachable, or no ".button_details"
#   element).
#
# Every step is wrapped in tic()/toc(), so its elapsed time is printed.
# Scraping errors are turned into NA by possibly() so that a single bad page
# cannot abort the whole run.
scrape_houses <- function(nb.page = 1){
  # NOTE(review): this looks like a Windows-style locale name, and the previous
  # locale is not restored when the function exits -- confirm this is intended.
  Sys.setlocale("LC_CTYPE","Arabic_Saudi Arabia") #change the locale system to get proper arabic text

  # Text of every node matching `css`, page by page (NA for a page that fails)
  node_text <- function(pages, css) {
    map(pages, possibly(function(page) html_text(html_nodes(page, css)), NA_real_))
  }

  # Store web url
  immo_url <- paste0(
    "https://www.ouedkniss.com/annonces/index.php?c=immobilier&sc=vente&sc2=appartement&wilaya=%2Calger&prix=1&prix_unite=2&p=",
    nb.page
  )

  # Collect the listing links behind the "details" buttons of the result page(s)
  tic("links")
  Links <- map(
    immo_url,
    possibly(
      function(url) {
        html_attr(html_nodes(read_html(url, encoding = "UTF-8"), ".button_details"), "href")
      },
      NA_real_
    )
  ) %>%
    unlist()
  toc()

  # Without this guard an empty result would be pasted into the bare site root
  # (and a failed page into ".../NA"), which would then be scraped as a listing.
  Links <- Links[!is.na(Links)]
  if (length(Links) == 0) {
    return(tibble())
  }
  Links <- paste0("https://www.ouedkniss.com/", Links)

  # Download every listing page. The reader must be handed to possibly() as a
  # function; a bare read_html(., ...) call would be evaluated on the spot,
  # outside of any pipe, where `.` does not exist.
  tic("pages")
  Pages <- map(Links, possibly(function(url) read_html(url, encoding = "UTF-8"), NA_real_))
  toc()

  # Getting the post title
  tic("titles")
  Title <- node_text(Pages, "#Title")
  toc()

  # Getting the description
  tic("description")
  Description <- node_text(Pages, "#GetDescription")
  toc()

  # Getting the price with the details
  tic("details")
  Details <- node_text(Pages, "#Prix,#Description p")
  toc()

  # Getting the store name
  tic("store.name")
  Store.Name <- node_text(Pages, "#store_name")
  toc()

  # Getting the store address
  tic("store.addresse")
  Store.Addresse <- node_text(Pages, "#store_adresse")
  toc()

  # Getting the pseudo of the owners
  tic("pseudo")
  Pseudo <- node_text(Pages, ".Pseudo")
  toc()

  # Adding everything to a tibble and reshaping the data
  tic("constructing tibble")
  # Store.Addresse is now an explicit column (placed last so the column order
  # of the result is unchanged) instead of being looked up implicitly.
  housing_data <- tibble(Links, Title, Description, Details, Store.Name, Pseudo, Store.Addresse) %>%
    # replace empty results (which compare equal to "character(0)") with NA
    mutate(Title = as.character(ifelse(Title=="character(0)", NA, Title)),
           Description = as.character(ifelse(Description=="character(0)", NA, Description)),
           Details = ifelse(Details=="character(0)", NA, Details),
           Store.Name = as.character(ifelse(Store.Name=="character(0)", NA, Store.Name)),
           Store.Addresse = as.character(ifelse(Store.Addresse=="character(0)", NA, Store.Addresse)),
           Pseudo = as.character(ifelse(Pseudo=="character(0)", NA, Pseudo)),
           # split each "label : value" detail on the first " : "
           Det = map(Details, function(x) str_trim(str_split_fixed(string = x[], pattern = " : ", n=2)[,1])),
           Values = map(Details, function(x) str_trim(str_split_fixed(string = x[], pattern = " : ", n=2)[,2]))
           # Det and Values are list columns, the data frame is nested at this point
    ) %>%
    select(-Details) %>%
    # get rid of the nested columns
    unnest() %>%
    # drop the rows whose detail label is empty
    filter(Det != "") %>%
    # normalise the spelling of the known labels; any other label is kept as is
    mutate(Det = case_when(
      Det == "Nombre de pièces" ~ "Nombre de pièces",
      Det == "Spécifications" ~ "Spécifications",
      Det == "Nombre d'étages / étage" ~ "Nombre d'étages / étage",
      Det == "Déposée le" ~ "Déposée le",
      Det == "Numéro" ~ "Numéro",
      TRUE ~ as.character(Det)
    )) %>%
    # remove duplicated labels: keep, per listing and label, the row with the highest Values
    group_by(Links, Det) %>%
    arrange(desc(Values)) %>%
    slice(1) %>%
    # one row per housing listing, all details as columns
    spread(Det,value = Values) %>%
    # do not leak the grouping to the caller
    ungroup()
  toc()

  housing_data
}
# Polite pause between two scraping rounds.
#
# Args:
#   periods: numeric vector; periods[1] and periods[2] are the lower and upper
#            bound (in seconds) of the uniformly drawn waiting time.
#
# Prints a separator line and a timestamped message giving the waiting time
# (rounded to 2 decimals), then sleeps for that long.
Asber_Chouia <- function (periods = c(1,1.5)) {
  wait_seconds <- runif(1, min = periods[1], max = periods[2])
  # visual separator between two executions
  cat("----------------------------------------------------------------------------------------", "\n", sep = "")
  cat(paste0(Sys.time()), "Rani Saber", round(wait_seconds, 2), "Seconds\n")
  cat("\n")
  # suspend the script before the next request is sent
  Sys.sleep(wait_seconds)
}
# Wait for a random delay, then scrape one page of listings.
#
# Args:
#   Start_Sleep, Finish_Sleep: bounds (in seconds) handed to Asber_Chouia().
#   Page: page number handed to scrape_houses().
#
# Returns whatever scrape_houses() returns for that page.
GentleScraping <- function(Start_Sleep=1, Finish_Sleep=1.5, Page){
  Asber_Chouia(periods = c(Start_Sleep, Finish_Sleep))
  scrape_houses(nb.page = Page)
}
starttime <- Sys.time() # start of the timed section
# Scrape 100 result pages (roughly one year of listings).
# The result list is allocated up front, one slot per page.
data_scraped <- vector("list", 100)
for (i in seq_along(data_scraped)) {
  cat("\n", "Scraping Page ", i, "\n", sep = "")
  # Each page links to 30 listings; GentleScraping() pauses for a few seconds
  # before it scrapes them, so the requests are spaced out.
  data_scraped[[i]] <- GentleScraping(Page = i, Start_Sleep = 1, Finish_Sleep = 1.5)
}
endtime <- Sys.time()
endtime - starttime # total scraping time
# Stack the per-page results into one big data frame
Algiers_SaleAppartment <- bind_rows(data_scraped)
# Save the data to disk, then read it back
write.csv(
  Algiers_SaleAppartment,
  "Algiers_SalesAppartments26072018.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)
housing <- read.csv("Algiers_SalesAppartments26072018.csv", fileEncoding = "UTF-8")
# Rename columns 7 to 15 (by position)
names(housing)[7:15] <- c(
  "Date", "Nb.Floor", "Nb.Room", "Nb.Views", "ID.Offer",
  "Price", "District", "Specifics", "Area"
)
# Derive new variables from the information held in the existing ones.
# First mutate(): split Price on spaces into a numeric value, a unit and a free-text
# remainder, and keep the leading number of Area.
housing <- housing %>%
mutate(Price.value = as.numeric(str_split_fixed(str_trim(housing$Price), " ",3)[,1]),
Price.unit= factor(str_split_fixed(str_trim(housing$Price), " ",3)[,2]),
Price.desc= factor(str_split_fixed(str_trim(housing$Price), " ",3)[,3]),
Area = as.numeric(str_split_fixed(str_trim(housing$Area), " ",2)[,1])
) %>%
# Second mutate(): Price.value.dzd applies the first matching rule below (case_when stops at
# the first TRUE condition); a price matching no rule is kept unchanged.
# NOTE(review): a "Millions" price of exactly 100, and an "m²" price of exactly 10000, match
# none of the dedicated rules and fall through to the default -- confirm this is intended.
mutate(Price.value.dzd = # Rules used to clean the price
case_when(.$Price.unit == "Milliards" & .$Price.value <=10
~ .$Price.value *10000000,
str_detect(.$Price.desc, "m²") == TRUE & .$Price.value <= 35
~ .$Price.value * .$Area*10000,
str_detect(.$Price.desc, "m²") == TRUE & .$Price.value > 35 & .$Price.value <10000
~ .$Price.value *10000,
str_detect(.$Price.desc, "m²") == TRUE & .$Price.value >10000
~ .$Price.value * .$Area,
.$Price.unit == "Millions"& .$Price.value <=10
~ .$Price.value * 10000000,
.$Price.unit == "Millions"& .$Price.value >100
~ .$Price.value * 10000,
.$Price.unit == "Millions"& .$Price.value >10 & .$Price.value <100
~.$Price.value * 1000000,
TRUE ~ .$Price.value
),
# Price.rules records, as text, which rule was applied to each price so that it can be used
# in the sanity checks. Its conditions repeat those of Price.value.dzd above and must be
# kept in sync with them by hand.
Price.rules =
case_when(.$Price.unit == "Milliards" & .$Price.value <=10
~ "Price.value *10000000",
str_detect(.$Price.desc, "m²") == TRUE & .$Price.value <= 35
~ "Price.value * Area*10000",
str_detect(.$Price.desc, "m²") == TRUE & .$Price.value > 35 & .$Price.value <10000
~ "Price.value *10000",
str_detect(.$Price.desc, "m²") == TRUE & .$Price.value >10000
~ "Price.value * Area",
.$Price.unit == "Millions"& .$Price.value <=10
~ "Price.value * 10000000",
.$Price.unit == "Millions"& .$Price.value >100
~ "Price.value * 10000",
.$Price.unit == "Millions"& .$Price.value >10 & .$Price.value <100
~"Price.value * 1000000",
TRUE ~ "Price.value"
),
# Create even more variables
# (disabled draft of the announcer variables, kept for reference)
# Announcer.Name = ifelse(is.na(Pseudo) == FALSE,Pseudo ,as.character(Store.Name),
# Announcer.Type = case_when(is.na(Store.Addresse) == FALSE & str_detect(toupper(Store.Name), "AGENCE|AG") == TRUE ~ "AGENCE",
# is.na(Store.Addresse) == FALSE & str_detect(toupper(Store.Name), "PROMOTION") == TRUE ~ "PROMOTEUR " ,
# TRUE ~ "BUREAU D'AFFAIRE"),
# Date holds "<date> à <time>": Hour takes the first two characters after "à", and must be
# computed before Date itself is overwritten with the parsed day-month-year part.
Hour = str_sub(str_trim(str_split_fixed(Date,"à", 2)[,2]),1,2),
Date = dmy(str_trim(str_split_fixed(Date,"à", 2)[,1])),
Month = month(Date,label = TRUE),
# Municipality: the part of the title starting at "Alger"
Municipality=factor(str_trim(str_extract(Title,"(Alger.*)"))),
# Flags derived from keywords found in the Specifics text
Garage=str_detect(Specifics, "Garage"),
Garden=str_detect(Specifics, "Jardin") ,
Furnished=str_detect(Specifics, "Meublé"),
Promise=str_detect(Specifics, "Promesse de vente"),
New.Project=str_detect(Specifics, "Promotion"),
Payment=factor(ifelse(str_detect(Specifics,"Paiement par tranches") == TRUE, "tranches", "comptant"))
)
housing
# Save the cleaned data, then summarise a first set of variables
write.csv(housing, "Algiers_SalesAppartments26072018_c.csv", row.names = FALSE, fileEncoding = "UTF-8")
housing %>% select(Date,Nb.Floor,Nb.Room,Nb.Views) %>% summary()
Date Nb.Floor Nb.Room Nb.Views
Min. :2017-07-18 Min. : 1 Min. :1.000 Min. : 9.0
1st Qu.:2018-04-11 1st Qu.: 1 1st Qu.:3.000 1st Qu.: 104.0
Median :2018-06-24 Median : 3 Median :3.000 Median : 241.0
Mean :2018-05-15 Mean : 240520 Mean :3.269 Mean : 911.8
3rd Qu.:2018-07-19 3rd Qu.: 4 3rd Qu.:4.000 3rd Qu.: 736.0
Max. :2018-07-26 Max. :552625820 Max. :8.000 Max. :57688.0
NA's :402 NA's :93 NA's :144
housing %>% select(Area,Price.value.dzd,Municipality) %>% summary()
Area Price.value.dzd Municipality
Min. :1.000e+00 Min. : 1 Alger Alger centre : 223
1st Qu.:7.000e+01 1st Qu.: 8500000 Alger Bab ezzouar : 153
Median :8.000e+01 Median : 14000000 Alger Cheraga : 131
Mean :1.115e+06 Mean : 16836916 Alger Bordj el bahri : 111
3rd Qu.:1.050e+02 3rd Qu.: 22000000 Alger Bordj el kiffan: 110
Max. :2.147e+09 Max. :100000000 Alger El achour : 107
NA's :179 (Other) :1865
housing %>% select(Garage:Payment) %>% summary()
Garage Garden Furnished Promise New.Project Payment
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical comptant:2310
FALSE:2031 FALSE:2197 FALSE:2273 FALSE:2257 FALSE:2098 tranches: 152
TRUE :431 TRUE :265 TRUE :189 TRUE :205 TRUE :364 NA's : 238
NA's :238 NA's :238 NA's :238 NA's :238 NA's :238
## Remove abnormal data: keep the listings whose price, area and number of
## rooms fall inside plausible bounds, then store Nb.Room as an ordered factor
## with levels 1 to 7.
housing2 <- housing %>%
  filter(
    Price.value.dzd >= 4000000,
    Price.value.dzd < 100000000,
    Area > 30,
    Area <= 300,
    str_trim(Nb.Room) %in% c("2", "3", "4", "5")
  ) %>%
  mutate(Nb.Room = ordered(str_trim(Nb.Room), levels = 1:7))
# Multiple pattern replacement: apply gsub() once per pattern/replacement pair,
# in order, each call working on the result of the previous one.
#
# Args:
#   pattern:     character vector of patterns.
#   replacement: character vector of replacements, same length as `pattern`.
#   x:           vector in which the replacements are made.
#   ...:         further arguments forwarded to gsub().
#
# Returns the value of the last gsub() call, or `x` unchanged when `pattern`
# is empty. Stops with an error when `pattern` and `replacement` differ in
# length.
mgsub <- function(pattern, replacement, x, ...) {
  if (length(pattern) != length(replacement)) {
    stop("pattern and replacement do not have the same length.")
  }
  result <- x
  # seq_along() yields no iteration for empty input, whereas 1:length(pattern)
  # would count down over c(1, 0) and hand invalid patterns to gsub()
  for (i in seq_along(pattern)) {
    result <- gsub(pattern[i], replacement[i], result, ...)
  }
  result
}
# Read the shape file of the Algerian communes and keep those of the wilaya of Algiers
# NOTE(review): the path is absolute and machine-specific
sh<-readShapePoly("C:/Users/fateh/Documents/R Scripts/Shapefile/algeria/communes.shp")
sh<-sh[sh@data$wilaya=="ALGER",]
# Clean and prepare the municipality names so that the names used on ouedkniss match those
# of the shape file
# Keep what follows the first space of Municipality, in upper case
housing2$Municipality<- toupper(str_split_fixed(string =housing2$Municipality, pattern = " ",n = 2 )[,2])
# Listings whose link contains "alger-centre" and whose District is missing are assigned "ALGER CENTRE"
housing2$Municipality <- factor(ifelse(str_detect (string = housing2$Links,pattern = "alger-centre") == TRUE & is.na(housing2$District) == TRUE,
"ALGER CENTRE", as.character(housing2$Municipality)))
# Correct the naming: each name of the first vector is replaced by the name at the same
# position in the second vector (the replacements are applied one after the other)
housing2$Municipality<- mgsub(c("GUE DE CONSTANTINE", "BAB EZZOUAR","BACHDJERRAH","HAMMAMET", "BIRKHADEM",
"BELOUIZDAD","BOLOGHINE","ALGER CENTRE", "CHEVALLEY","HRAOUA","TESSALA EL MERDJA","SAID HAMDINE","AIN NAADJA"),
c("DJISR KSENTINA","BEB EZZOUAR","BACH DJERRAH", "BAINS ROMAINS", "BIR KHADEM", "HAMMA ANASSERS",
"BOLOGHINE IBN ZIRI", "ALGER" ,"BOUZAREAH", "HARAOUA", "TASSALA EL MERDJA","BIR MOURAD RAIS","DJISR KSENTINA"),
housing2$Municipality)
# An empty municipality name is replaced by "ALGER"
housing2$Municipality <- factor(ifelse(housing2$Municipality== "","ALGER", as.character(housing2$Municipality)))
## Create the top municipalities variable: fct_lump() is called with 10, the remaining
## municipalities being lumped together
housing2<- housing2%>% mutate(Top_Municipalities = fct_lump(housing2$Municipality, 10))
# Summarise a first set of variables of the filtered data
housing2 %>% select(Date,Nb.Floor,Nb.Room,Nb.Views) %>% summary()
Date Nb.Floor Nb.Room Nb.Views
Min. :2017-07-18 Min. : 1.00 1: 0 Min. : 9.0
1st Qu.:2018-04-15 1st Qu.: 1.00 2: 350 1st Qu.: 104.0
Median :2018-06-24 Median : 3.00 3:1051 Median : 242.0
Mean :2018-05-16 Mean : 46.21 4: 611 Mean : 931.9
3rd Qu.:2018-07-19 3rd Qu.: 4.00 5: 140 3rd Qu.: 724.0
Max. :2018-07-26 Max. :14567.00 6: 0 Max. :57688.0
NA's :292 7: 0 NA's :112
housing2 %>% select(Area,Price.value.dzd,Municipality) %>% summary()
Area Price.value.dzd Municipality
Min. :3.100e+01 Min. : 4000000 ALGER : 176
1st Qu.:7.000e+01 1st Qu.:10000000 BEB EZZOUAR : 112
Median :8.000e+01 Median :15500000 CHERAGA : 111
Mean :1.306e+06 Mean :18606147 DJISR KSENTINA : 100
3rd Qu.:1.050e+02 3rd Qu.:23500000 BORDJ EL BAHRI : 96
Max. :2.147e+09 Max. :95000000 BORDJ EL KIFFAN: 95
(Other) :1462
housing2 %>% select(Garage:Payment) %>% summary()
Garage Garden Furnished Promise New.Project Payment
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical comptant:1853
FALSE:1633 FALSE:1769 FALSE:1841 FALSE:1819 FALSE:1680 tranches: 127
TRUE :347 TRUE :211 TRUE :139 TRUE :161 TRUE :300 NA's : 172
NA's :172 NA's :172 NA's :172 NA's :172 NA's :172
# Visual checks: counts per top municipality, then the municipality names of the data next
# to the commune names of the shape file
data.frame(Top_Municipalities =table(housing2$Top_Municipalities))## check
unique(sort(factor(housing2$Municipality)))
unique(sort(factor(sh@data$commune0)))## check again, to dispel any doubt
# Get the geolocation of the municipalities: build one "Algiers, <MUNICIPALITY>" query per
# distinct municipality; the query for "ALGER" itself is rewritten as "ALGER CENTRE, Algiers"
commune.names <- paste("Algiers,", as.character(unique(housing2$Municipality)), sep=" ")
commune.names[commune.names == "Algiers, ALGER"] <- "ALGER CENTRE, Algiers"
commune.info <- geocode(commune.names, output = "more", override_limit = TRUE)
# Get the localities that were not geocoded in the first run (missing longitude)
commune.names.missing <- cbind(commune.names,commune.info) %>%
filter(is.na(lon))
# Second geocode pass, on the missing localities only
commune.info.missing <- geocode(as.character(commune.names.missing$commune.names), output = "more", override_limit = TRUE)
# Bind the rows geocoded in the first pass with the results of the second pass
commune.info_final<-
cbind(commune.names,commune.info) %>%
filter(!is.na(lon)) %>%
bind_rows(.,cbind(commune.names=commune.names.missing$commune.names,commune.info.missing))
# More cleansing: restore the "Algiers, ALGER" name that was rewritten for the query
commune.info_final <-
mutate(commune.info_final,commune.names = factor(ifelse(commune.names == "ALGER CENTRE, Algiers",
"Algiers, ALGER",
as.character(commune.names))))
# Municipality: the part of commune.names that follows the first comma, trimmed
commune.info_final$Municipality <- trimws(str_split_fixed(commune.info_final$commune.names,",",2))[,2]
# Create an augmented data set holding the geolocation data
Housing_Data <-
housing2%>%
# no `by` is given, so the join uses every column name the two tables share
# (presumably only Municipality -- verify)
left_join(., commune.info_final) %>%
# Create the announcer name and announcer type variables.
# Announcer.Name: the pseudo when there is one, the store name otherwise.
# Announcer.Type: the first matching rule wins.
# NOTE(review): the "AG" alternative of "AGENCE|AG" matches any store name that merely
# contains the letters "AG" -- confirm this is intended.
mutate(Announcer.Name = ifelse(is.na(Pseudo) == FALSE,as.character(Pseudo) ,as.character(Store.Name)),
Announcer.Type = case_when(str_detect(toupper(Store.Name), "AGENCE|AG") == TRUE ~ "AGENCE",
str_detect(toupper(Store.Name), "PROMOTION") == TRUE ~ "PROMOTEUR" ,
str_detect(toupper(Store.Name), "BUREAU|AFFAIRE") == TRUE ~ "BUREAU D'AFFAIRE",
is.na(Pseudo) == FALSE ~ "PARTICULIER",
TRUE ~ "BUREAU D'AFFAIRE")
)
# Print both tables for inspection
commune.info_final
Housing_Data
# Get the municipality of the store: look in the upper-cased store address for any of the
# housing municipality names, or for any of the alternative spellings listed below, and keep
# the matched text.
# NOTE(review): [-10] drops the 10th distinct municipality by position; which name that is
# depends on the order of the data -- verify, and consider excluding it by name instead.
# NOTE(review): the addresses are taken from housing2 and assigned to Housing_Data, which
# assumes both tables have the same rows in the same order after the join -- confirm.
Housing_Data$Store.Municipality <-
str_match(toupper(housing2$Store.Addresse),
pattern = paste(c(unique(as.character(housing2$Municipality))[-10],
c("GUE DE CONSTANTINE", "BAB EZZOUAR","BACHDJERRAH","HAMMAMET", "BIRKHADEM", "BELOUIZDAD","BOLOGHINE",
"ALGER CENTRE", "CHEVALLEY","HRAOUA","TESSALA EL MERDJA","SAID HAMDINE","AIN NAADJA")) ,
collapse = "|")) [,1]
# Match the store municipality names with the housing municipality names (same replacement
# table as the one applied to housing2$Municipality)
Housing_Data$Store.Municipality<- mgsub(c("GUE DE CONSTANTINE", "BAB EZZOUAR","BACHDJERRAH","HAMMAMET", "BIRKHADEM",
"BELOUIZDAD","BOLOGHINE","ALGER CENTRE", "CHEVALLEY","HRAOUA","TESSALA EL MERDJA","SAID HAMDINE","AIN NAADJA"),
c("DJISR KSENTINA","BEB EZZOUAR","BACH DJERRAH", "BAINS ROMAINS", "BIR KHADEM", "HAMMA ANASSERS",
"BOLOGHINE IBN ZIRI", "ALGER" ,"BOUZAREAH", "HARAOUA", "TASSALA EL MERDJA","BIR MOURAD RAIS","DJISR KSENTINA"),
Housing_Data$Store.Municipality)
# Get the longitude and latitude of the store municipality from the geocoded communes
Housing_Data <-left_join(Housing_Data, select(commune.info_final,Municipality, Store.lon=lon, Store.lat=lat), by = c("Store.Municipality" = "Municipality"))
# Overview of the final data set
glimpse(Housing_Data)
Observations: 2,152
Variables: 50
$ Links <fctr> https://www.ouedkniss.com/vente-appartement-f2-alger-belouizdad-algerie-immobilier-d16427845, https://www.ouedkniss.com/vente-ap...
$ Title <fctr> Vente Appartement F2 Alger Belouizdad, Vente Appartement F2 Alger Birtouta, Vente Appartement F2 Alger Dar el beida, Vente Appar...
$ Description <fctr> NA, J'ai une grande,propre et top f2 très bien fini refait à neuf, meublé (cuisine et salle de bain équipée et aménager,wc, gran...
$ Store.Name <fctr> NA, NA, NA, NA, MS.Immobilier, NA, APISERVCES, APISERVCES, APISERVCES, APISERVCES, NA, NA, Bureau d'affaire Acs, NA, MT AFFAIRES...
$ Pseudo <fctr> youyayouya, F123kniss, megatrain21, kikoum, NA, sidaeli1230, NA, NA, NA, NA, mohamedsebaa, meriemchouch, NA, Ilyes_blh, NA, NA, ...
$ Store.Addresse <fctr> NA, NA, NA, NA, Alger Bab ezzouar bab ezzouar, NA, Alger El madania 27A, Rue Amar Boudlel, El-Madania- Alger, Alger El madania 2...
$ Date <date> 2018-07-26, 2018-07-26, 2018-07-26, 2018-07-26, 2018-07-26, 2018-07-26, 2018-07-26, 2018-07-26, 2018-07-26, 2018-07-26, 2018-07-...
$ Nb.Floor <int> NA, 3, 4, 1, 4, 3, 2, 3, 3, NA, 4, 2, 12345, NA, 4, 245, 2, 5, NA, 4, 2, NA, 4, 124, 1234, 12345, 4, NA, NA, 2, 3, 3, 3, 3, 5, 3,...
$ Nb.Room <ord> 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, ...
$ Nb.Views <int> 119, NA, 33, 381, 9, 39, 15, 17, 19, 19, 36, 11, 210, NA, 24, 5536, 16, NA, 95, 72, 8265, 42, 94, 9966, 3869, 4952, 13, 488, NA, ...
$ ID.Offer <int> 16427845, 16340043, 16600827, 16512113, 16603790, 16600079, 16602760, 16602876, 16602975, 16602956, 16599266, 16601484, 16335971,...
$ Price <fctr> 860 Millions Négociable, 1 Millions Négociable, 7 Millions Négociable, 900 Millions Offert, 2.4 Milliards Négociable, 750 Millio...
$ District <fctr> belcourt, Centre ville, Fatma nsoumer, Cité hayet, 1200 LOGTS, 13 åßÊÇÑ, les sources, la cadat, les vergeres, Résidance rahma, R...
$ Specifics <fctr> Electricité , Gaz , Eau , Acte notarié , Livret foncier, Electricité , Gaz , Eau , Meublé, Electricité , Gaz , Eau , P...
$ Area <dbl> 48, 70, 65, 42, 97, 70, 100, 136, 68, 88, 62, 59, 110, 59, 70, 80, 75, 84, 90, 80, 114, 100, 145, 123, 125, 81, 146, 68, 54, 37, ...
$ Price.value <dbl> 860.0000, 1.0000, 7.0000, 900.0000, 2.4000, 750.0000, 2.8000, 4.6000, 2.2000, 2.1000, 15.0000, 1.6000, 22.0000, 900.0000, 1.4000,...
$ Price.unit <fctr> Millions, Millions, Millions, Millions, Milliards, Millions, Milliards, Milliards, Milliards, Milliards, Millions, Millions, Mil...
$ Price.desc <fctr> Négociable, Négociable, Négociable, Offert, Négociable, Fixe, Négociable, Négociable, Négociable, Négociable, Négociable, Fixe, ...
$ Price.value.dzd <dbl> 8600000, 10000000, 70000000, 9000000, 24000000, 7500000, 28000000, 46000000, 22000000, 21000000, 15000000, 16000000, 22000000, 90...
$ Price.rules <chr> "Price.value * 10000", "Price.value * 10000000", "Price.value * 10000000", "Price.value * 10000", "Price.value *10000000", "Price...
$ Announcer.Name <chr> "youyayouya", "F123kniss", "megatrain21", "kikoum", "MS.Immobilier", "sidaeli1230", "APISERVCES", "APISERVCES", "APISERVCES", "AP...
$ Announcer.Type <chr> "PARTICULIER", "PARTICULIER", "PARTICULIER", "PARTICULIER", "BUREAU D'AFFAIRE", "PARTICULIER", "BUREAU D'AFFAIRE", "BUREAU D'AFFA...
$ Hour <chr> "19", "19", "19", "21", "22", "16", "19", "20", "20", "20", "14", "19", "22", "20", "19", "22", "20", "20", "16", "19", "22", "20...
$ Month <ord> Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul, Jul,...
$ Municipality <chr> "HAMMA ANASSERS", "BIRTOUTA", "DAR EL BEIDA", "DJISR KSENTINA", "BEB EZZOUAR", "BARAKI", "BIR MOURAD RAIS", "BIR MOURAD RAIS", "B...
$ Garage <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, NA, FALSE, FALSE, FALSE,...
$ Garden <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, NA, FALSE, FALSE, FALSE...
$ Furnished <lgl> FALSE, TRUE, FALSE, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, NA, TRUE, FALSE, FALSE, ...
$ Promise <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, NA, FALSE, TRUE, FALSE, ...
$ New.Project <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, NA, FALSE, NA, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, NA, FALSE, TRUE, FALSE, FA...
$ Payment <fctr> comptant, comptant, comptant, comptant, comptant, NA, comptant, NA, comptant, comptant, tranches, comptant, comptant, comptant, ...
$ Top_Municipalities <fctr> Other, Other, Other, DJISR KSENTINA, BEB EZZOUAR, Other, Other, Other, Other, BIR KHADEM, Other, ALGER, CHERAGA, CHERAGA, Other,...
$ Store.Municipality <chr> NA, NA, NA, NA, "BEB EZZOUAR", NA, "EL MADANIA", "EL MADANIA", "EL MADANIA", "EL MADANIA", NA, NA, "KOUBA", NA, "DAR EL BEIDA", "...
$ commune.names <fctr> Algiers, HAMMA ANASSERS, Algiers, BIRTOUTA, Algiers, DAR EL BEIDA, Algiers, DJISR KSENTINA, Algiers, BEB EZZOUAR, Algiers, BARAK...
$ lon <dbl> 3.058756, 3.047605, 3.228198, 3.079263, 3.185497, 3.098580, 3.050374, 3.050374, 3.050374, 3.042598, 3.012567, 3.055116, 2.922589,...
$ lat <dbl> 36.75377, 36.64609, 36.70601, 36.69788, 36.72063, 36.66730, 36.73535, 36.73535, 36.73535, 36.71627, 36.78164, 36.77248, 36.76235,...
$ type <chr> "locality", "locality", "locality", "locality", "locality", "locality", "locality", "locality", "locality", "locality", "locality...
$ loctype <chr> "approximate", "approximate", "approximate", "approximate", "approximate", "approximate", "approximate", "approximate", "approxim...
$ address <chr> "algiers, sidi m'hamed, algeria", "birtouta, algeria", "dar el beïda, algeria", "djasr kasentina, algeria", "bab ezzouar, algeria...
$ north <dbl> 36.75878, 36.66857, 36.73901, 36.71377, 36.74174, 36.70469, 36.74079, 36.74079, 36.74079, 36.73351, 36.80971, 36.78404, 36.78475,...
$ south <dbl> 36.74887, 36.61635, 36.66587, 36.67178, 36.70393, 36.63556, 36.71975, 36.71975, 36.71975, 36.69576, 36.76851, 36.75229, 36.73448,...
$ east <dbl> 3.061624, 3.071880, 3.259807, 3.114774, 3.206227, 3.131125, 3.064499, 3.064499, 3.064499, 3.073656, 3.041807, 3.076917, 2.983024,...
$ west <dbl> 3.054058, 2.947340, 3.163548, 3.042440, 3.166648, 3.065958, 3.022013, 3.022013, 3.022013, 3.023633, 2.980932, 3.038953, 2.873654,...
$ locality <chr> "Algiers", "Birtouta", "Dar El Beïda", "Djasr Kasentina", "Bab Ezzouar", "Baraki", "Bir Mourad Raïs", "Bir Mourad Raïs", "Bir Mou...
$ locality.1 <fctr> Sidi M'Hamed, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ administrative_area_level_1 <fctr> Algiers Province, Algiers Province, Algiers Province, Algiers Province, Algiers Province, Algiers Province, Algiers Province, Al...
$ country <fctr> Algeria, Algeria, Algeria, Algeria, Algeria, Algeria, Algeria, Algeria, Algeria, Algeria, Algeria, Algeria, Algeria, Algeria, Al...
$ political <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ Store.lon <dbl> NA, NA, NA, NA, 3.185497, NA, 3.068890, 3.068890, 3.068890, 3.068890, NA, NA, 3.081495, NA, 3.228198, 3.081495, 3.068890, NA, NA,...
$ Store.lat <dbl> NA, NA, NA, NA, 36.72063, NA, 36.74118, 36.74118, 36.74118, 36.74118, NA, NA, 36.72667, NA, 36.70601, 36.72667, 36.74118, NA, NA,...